library(stringr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
dat <- read.csv('mobile-food-sf.csv', stringsAsFactors = FALSE)
Let’s begin processing the values in column starttime. The goal is to obtain new times in 24 hr format. For example, a starting time of 10AM will be transformed to 10:00. Likewise, a starting time of 1PM will be transformed to 13:00.
# toy string
time1 <- '10AM'
To get the time and period values, you can use str_sub():
# hour
str_sub(time1, start = 1, end = 2)
## [1] "10"
# period
str_sub(time1, start = 3, end = 4)
## [1] "AM"
Your turn: What about times where the hour has just one digit? For example: 9AM, or 8AM? Create the following vector times and try to subset the hour and the periods with str_sub()
times <- c('12PM', '10AM', '9AM', '8AM')
# subset time
time_sub <- str_sub(times, start = 1, end = nchar(times)- 2)
time_sub
## [1] "12" "10" "9" "8"
# subset period
period <- str_sub(times, start = -2)
period
## [1] "PM" "AM" "AM" "AM"
The tricky part with the vector times is the extraction of the hour. One solution is to “remove” the characters AM or PM from each time. You can do this with the substitution function str_replace():
str_replace(times, pattern = 'AM|PM', replacement = '')
## [1] "12" "10" "9" "8"
Your Turn
So far you’ve managed to get the hour value and the period. Notice that you still need to convert the hours as numeric vectors:
hours <- as.numeric(str_replace(times, pattern = 'AM|PM', replacement = ''))
periods <- str_sub(times, start = -2)
Transform the hours into 24 hr values. Write R code to create a vector start24 that contains the hour in 24hr scale.
start24 <- rep(0, length(times))
for(i in 1:length(times)){
if(periods[i] == "AM"){
start24[i] = hours[i]
}else{
start24[i] = hours[i] + 12
}
}
start24
## [1] 24 10 9 8
Add two columns start and end to the data frame dat, containing the starting and ending hour respectively (columns must be “numeric”).
to24h <- function(x){
hours <- as.numeric(str_replace(x, pattern = 'AM|PM', replacement = ''))
periods <- str_sub(x, start = -2)
start24 <- rep(0, length(x))
for(i in 1:length(x)){
if(periods[i] == "AM"){
start24[i] = hours[i]
}else{
start24[i] = hours[i] + 12
}
}
return(start24)
}
# add 'start' hours
dat$start <- to24h(dat$starttime)
# add 'end' hours
dat$end <- to24h(dat$endtime)
With the starting and ending hours, calculate the duration, and add one more column duration to the data frame dat:
dat$duration <- dat$end - dat$start
for(i in 1:length(dat$duration)){
if(dat$duration[i] < 0 ){
dat$duration[i] = dat$duration[i] + 24
}
}
loc1 <- "(37.7651967350509,-122.416451692902)"
The goal is to split Location into latitude and longitude. The first value corresponds to latitude, while the second value corresponds to longitude.
First we need to remove the parenthesis. The issue here is that the characters ( and ) have special meanings; recall they are metacharacters. So you need to escape in R them by pre-appending two backslashes: \( and \)
# "remove" opening parenthesis
str_replace(loc1, pattern = '\\(', replacement = '')
## [1] "37.7651967350509,-122.416451692902)"
# "remove" closing parenthesis
str_replace(loc1, pattern = '\\)', replacement = '')
## [1] "(37.7651967350509,-122.416451692902"
You can also combine both patterns in a single call. But be careful: str_replace() replaces only the first occurrence of ( or ). However, the location values contain both opening and closing parentheses. To replace them all, you have to use str_replace_all()
lat_lon <- str_replace_all(loc1, pattern = '\\(|\\)', replacement = '')
Now we need to get rid of the comma ,.
Instead of replacing the comma, what we need to use is str_split()
# string split in stringr
str_split(lat_lon, pattern = ',')
## [[1]]
## [1] "37.7651967350509" "-122.416451692902"
Let’s define a vector with more location values, so we can start generalizing our code:
locs <- c(
"(37.7651967350509,-122.416451692902)",
"(37.7907890558203,-122.402273431333)",
"(37.7111991003088,-122.394693339395)",
"(37.7773000262759,-122.394812784799)",
NA
)
create a list lat_lon containing the latitude and the longitude values of locs
lat_lon <- str_replace_all(locs, pattern = '\\(|\\)', replacement = '')
lat_lon <- str_split(lat_lon, pattern = ",")
lat_lon
## [[1]]
## [1] "37.7651967350509" "-122.416451692902"
##
## [[2]]
## [1] "37.7907890558203" "-122.402273431333"
##
## [[3]]
## [1] "37.7111991003088" "-122.394693339395"
##
## [[4]]
## [1] "37.7773000262759" "-122.394812784799"
##
## [[5]]
## [1] NA
to retrieve the latitude and longitude values, you can use the lapply() function, and then specify an anonymous function to get the first element (for the latitude):
lat <- lapply(lat_lon, function(x) x[1])
lat
## [[1]]
## [1] "37.7651967350509"
##
## [[2]]
## [1] "37.7907890558203"
##
## [[3]]
## [1] "37.7111991003088"
##
## [[4]]
## [1] "37.7773000262759"
##
## [[5]]
## [1] NA
Your Turn
Create a list lon by using lapply() with an anonymous function to extract longitude value (i.e. the second element):
lon <- lapply(lat_lon, function(x) x[2])
To convert from list to a vector, use unlist()
latitute <- as.numeric(unlist(lat))
longitude <- as.numeric(unlist(lon))
Add two more columns: lat and lon to the data frame dat
dat_lat_lon <- str_replace_all(dat$Location, pattern = '\\(|\\)', replacement = '')
dat_lat_lon <- str_split(dat_lat_lon, pattern = ",")
dat_lat <- lapply(dat_lat_lon, function(x) x[1])
dat_lon <- lapply(dat_lat_lon, function(x) x[2])
dat$lat <- as.numeric(unlist(dat_lat))
dat$lon <- as.numeric(unlist(dat_lon))
Now that you have two vectors latitude and longitude, and the corrsponding columns lat and lon in the data frame dat, let’s try to plot those coordinates on a map.
A naive option would be to graph the locations with plot():
plot(dat$lon, dat$lat, pch = 19, col = "#77777744")
Althouth the previous call shows the dots with the right latitude and longitude coordinates, there’s no visual cues that let us perceive the information in a geographical way.
Instead of displaying a naked plot(), we can use the package “RgoogleMaps” which is one the several packages available in R to plot maps.
# install.packages("RgoogleMaps")
library(RgoogleMaps)
## Warning: package 'RgoogleMaps' was built under R version 3.4.2
To get a map you use the function GetMap() which requires a center and a zoom specifications. The center is a vector with the latitude and longitude coordinates. The argument zoom refers to the zoom level.
# coordinates for center of the map
center <- c(mean(dat$lat, na.rm = TRUE), mean(dat$lon, na.rm = TRUE))
# zoom value
zoom <- min(MaxZoom(range(dat$lat, na.rm = TRUE),
range(dat$lon, na.rm = TRUE)))
# san francisco map
map1 <- GetMap(center=center, zoom=zoom, destfile = "san-francisco.png")
The code above downloads a static map from the Google server and saves it in the specified destination file. To make a plot you have to use PlotOnStaticMap()
PlotOnStaticMap(map1, dat$lat, dat$lon, col = "#ed4964", pch=20)
Another useful package for plotting maps is “ggmap”. As you may guess, “ggmap” follows the graphing approach of “ggplot2”.
As usual, you need to install the package:
library(ggmap)
## Warning: package 'ggmap' was built under R version 3.4.2
## Loading required package: ggplot2
Because some rows have missing values in the geographical coordinates, we can get rid of them with ’na.omit():
# let's get rid of rows with missing values
dat <- na.omit(dat)
In order to plot a map with ggmap(), we need to define the region of the map via the function make_bbox():
# ggmap typically asks you for a zoom level,
# but we can try using ggmap's make_bbox function:
sbbox <- make_bbox(lon = dat$lon, lat = dat$lat, f = .1)
sbbox
## left bottom right top
## -122.48867 37.69985 -122.36281 37.81595
Now that you have the object sbbox, the next step is to get a map with get_map(). This function gets a map from Google by default.
# get a 'terrain' map
sf_map <- get_map(location = sbbox, maptype = "terrain", source = "google")
## Warning: bounding box given to google - spatial extent only approximate.
## converting bounding box to center/zoom specification. (experimental)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=37.757897,-122.425744&zoom=13&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
Having obtained the sf_map object, we can finally use ggmap() to plot some dots with our lat and lon coordinates:
ggmap(sf_map) +
geom_point(data = dat,
mapping = aes(x = lon, y = lat),
color = "red", alpha = 0.2, size = 1)
## Warning: Removed 98 rows containing missing values (geom_point).
What if you want to identify all locations that have burritos? This is where regular expressions comes very handy. Again, always start small: select the first 10 elements of optionaltext
foods <- dat$optionaltext[1:10]
Use str_detect() (or equivalently grep()) to match “Burritos” and “burritos”.
str_detect(foods, pattern = "Burritos|burritos")
## [1] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
Try another pattern: e.g. “tacos”, or “quesadillas”
str_detect(foods, pattern = "tacos|Tacos|quesadillas|Quesadillas")
## [1] TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
Now create a data frame burritos by subsetting (i.e. filtering) the data frame to get only those rows that match “burritos”
burritos <- dat %>%
filter(str_detect(optionaltext, pattern = "burritos"))
Use the lat and lon corrdinates in burritos to display a map of locations with burritos (see map below).
#define the region of the map
burritosbox <- make_bbox(burritos$lon, burritos$lat, f = 0.1)
burritosbox
## left bottom right top
## -122.48776 37.70111 -122.37289 37.80205
#get a map with get_map()
burritosmap <- get_map(location = burritosbox, maptype = "terrain", source = "google")
## Warning: bounding box given to google - spatial extent only approximate.
## converting bounding box to center/zoom specification. (experimental)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=37.751581,-122.430325&zoom=13&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
#use ggmap() to plot some dots
ggmap(burritosmap) +
geom_point(data = burritos,
mapping = aes(x = lon, y = lat),
color = "blue", alpha = 0.2, size = 1)
Here’s a vector with some character strings:
# vector of strings
animals <- c('dog', 'cat', 'bird', 'dolphin', 'lion',
'zebra', 'tiger', 'wolf', 'whale', 'eagle',
'pig', 'osprey', 'kangaroo', 'koala')
Let’s match the pattern dog with grep()
grep('dog', animals)
## [1] 1
To extract the matched pattern, you can use grep() with the argument value = TRUE:
grep('dog', animals, value = TRUE)
## [1] "dog"
Using “stringr” functions, you can use str_detect() to check if there is a match:
str_detect(animals, 'dog')
## [1] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE
To extract the matched pattern, you use str_extract():
str_extract(animals, 'dog')
## [1] "dog" NA NA NA NA NA NA NA NA NA NA
## [12] NA NA NA
What if you just want to extract the matched pattern and nothing else? Then use logical subsetting:
animals[str_detect(animals, 'dog')]
## [1] "dog"
Your turn
Use str_extract(), and logical subsetting with str_detect(), to find the names of animals with: - zero or more o
animals[str_detect(animals, "o*")]
## [1] "dog" "cat" "bird" "dolphin" "lion" "zebra"
## [7] "tiger" "wolf" "whale" "eagle" "pig" "osprey"
## [13] "kangaroo" "koala"
animals[str_detect(animals, "o?")]
## [1] "dog" "cat" "bird" "dolphin" "lion" "zebra"
## [7] "tiger" "wolf" "whale" "eagle" "pig" "osprey"
## [13] "kangaroo" "koala"
animals[str_detect(animals, "o+")]
## [1] "dog" "dolphin" "lion" "wolf" "osprey" "kangaroo"
## [7] "koala"
animals[str_detect(animals, "o{2}")]
## [1] "kangaroo"
animals[str_detect(animals, "o[^o]")]
## [1] "dog" "dolphin" "lion" "wolf" "osprey" "koala"
animals[str_detect(animals, "[aeiou][aeiou]")]
## [1] "lion" "eagle" "kangaroo" "koala"
animals[str_detect(animals, "[^aeiou][^aeiou]")]
## [1] "bird" "dolphin" "zebra" "wolf" "whale" "eagle"
## [7] "osprey" "kangaroo"
animals[str_detect(animals, "[^aeiou]{3}")]
## [1] "dolphin" "osprey"
animals[str_detect(animals, "\\b[a-z]{3}\\b")]
## [1] "dog" "cat" "pig"
animals[str_detect(animals, "\\b[a-z]{4}\\b")]
## [1] "bird" "lion" "wolf"
Here’s another character vector with some file names and their extensions:
files <- c('sales1.csv', 'orders.csv', 'sales2.csv',
'sales3.csv', 'europe.csv', 'usa.csv', 'mex.csv',
'CA.csv', 'FL.csv', 'NY.csv', 'TX.csv',
'sales-europe.csv', 'sales-usa.csv', 'sales-mex.csv')
Your turn - Find the file names containing numbers
files[str_detect(files, pattern = "\\d")]
## [1] "sales1.csv" "sales2.csv" "sales3.csv"
files[str_detect(files, pattern = "^\\D*$")]
## [1] "orders.csv" "europe.csv" "usa.csv"
## [4] "mex.csv" "CA.csv" "FL.csv"
## [7] "NY.csv" "TX.csv" "sales-europe.csv"
## [10] "sales-usa.csv" "sales-mex.csv"
files[str_detect(str_sub(files, start = 1, end = nchar(files) - 4), pattern = "[a-z]")]
## [1] "sales1.csv" "orders.csv" "sales2.csv"
## [4] "sales3.csv" "europe.csv" "usa.csv"
## [7] "mex.csv" "sales-europe.csv" "sales-usa.csv"
## [10] "sales-mex.csv"
files[str_detect(str_sub(files, start = 1, end = nchar(files) - 4), pattern = "[A-Z]")]
## [1] "CA.csv" "FL.csv" "NY.csv" "TX.csv"
files[str_detect(files, pattern = "-")]
## [1] "sales-europe.csv" "sales-usa.csv" "sales-mex.csv"
files[str_detect(files, pattern = "^[^-]*$")]
## [1] "sales1.csv" "orders.csv" "sales2.csv" "sales3.csv" "europe.csv"
## [6] "usa.csv" "mex.csv" "CA.csv" "FL.csv" "NY.csv"
## [11] "TX.csv"
files_txt <- str_replace(files,pattern = "csv", "txt")
files_txt
## [1] "sales1.txt" "orders.txt" "sales2.txt"
## [4] "sales3.txt" "europe.txt" "usa.txt"
## [7] "mex.txt" "CA.txt" "FL.txt"
## [10] "NY.txt" "TX.txt" "sales-europe.txt"
## [13] "sales-usa.txt" "sales-mex.txt"
str_split(files, pattern = "\\.")
## [[1]]
## [1] "sales1" "csv"
##
## [[2]]
## [1] "orders" "csv"
##
## [[3]]
## [1] "sales2" "csv"
##
## [[4]]
## [1] "sales3" "csv"
##
## [[5]]
## [1] "europe" "csv"
##
## [[6]]
## [1] "usa" "csv"
##
## [[7]]
## [1] "mex" "csv"
##
## [[8]]
## [1] "CA" "csv"
##
## [[9]]
## [1] "FL" "csv"
##
## [[10]]
## [1] "NY" "csv"
##
## [[11]]
## [1] "TX" "csv"
##
## [[12]]
## [1] "sales-europe" "csv"
##
## [[13]]
## [1] "sales-usa" "csv"
##
## [[14]]
## [1] "sales-mex" "csv"
Create a function split_chars() that splits a character string into one single character elements. The output must be a single character vector.
split_chars <- function(x){
list <- str_split(x,"")
return(unlist(list))
}
split_chars('Go Bears!')
## [1] "G" "o" " " "B" "e" "a" "r" "s" "!"
Here’s another exmaple, split_chars(‘Expecto Patronum’) should return:
split_chars('Expecto Patronum')
## [1] "E" "x" "p" "e" "c" "t" "o" " " "P" "a" "t" "r" "o" "n" "u" "m"
Write a function reverse_chars() that reverses a string by characters.
reverse_chars <- function(x) {
letter = split_chars(x)
reverse_letter = rep("", length(letter))
for(i in 1:length(letter)){
reverse_letter[i] = letter[length(letter) + 1 - i]
}
return(paste0(reverse_letter, collapse = ""))
}
reverse_chars("gattaca")
## [1] "acattag"
reverse_chars("Lumox Maxima")
## [1] "amixaM xomuL"